library("GEOquery")
library("readr")
library("here")
library("tidyverse")00_all
Master Document - Breast Cancer Response Analysis
Load Data
Load libraries
Save raw data
gse <- getGEO("GSE20194",
GSEMatrix = TRUE,
getGPL = FALSE)Found 1 file(s)
GSE20194_series_matrix.txt.gz
save(gse,
file = here("data/_raw/raw_data.RData"))Load raw data
load(here("data/_raw/raw_data.RData"))
eset <- gse[[1]]Preliminary processing
Convert the ExpressionSet object to a data frame and handle missing or blank values to prevent misalignment.
# Extract sample metadata and add a sample column
# Using %>% instead of |> because %>% allows using '.' as placeholder
dat_load <- as.data.frame(pData(eset)) %>%
mutate(
sample_id = rownames(.)
) %>%
relocate(sample_id) %>%
mutate(
across(where(is.factor), as.character)
) %>%
mutate(
across(everything(),
~ ifelse(. == "" | is.na(.), NA_character_, .))
)Save load data
write_tsv(
dat_load,
here("data", "01_dat_load.tsv"),
na = "NA")Clean Data
Load libraries
library("tidyverse")
library("here")Load data
input_file <- here("data/01_dat_load.tsv")
dat_load <- read_tsv(
input_file,
na = c("",
"NA"),
show_col_types = FALSE)Clean data
# Select columns with core characteristics
dat_core <- dat_load |>
select(
geo_accession,
matches("^characteristics_ch1\\.([1-9]|1[0-4])$"))dat_clean <- dat_core |>
# Pivot all characteristics_ch1.* columns longer
pivot_longer(
cols = starts_with("characteristics_ch1."),
names_to = "char_num",
values_to = "char_value") |>
# Separate colname and value by ": "
separate(
char_value,
into = c("colname",
"value"),
sep = ":\\s+") |>
select(
-("char_num")) |>
# Handle duplicate entries by keeping the first value
pivot_wider(
names_from = colname,
values_from = value,
values_fn = function(x) x[1]) |>
select(
-`treatments comments`,
-`NA`) |>
# Replace 'ND' with NA
mutate(
across(everything(),
~na_if(.,
"ND")))# Unify variants in histology
dat_clean <- dat_clean |>
mutate(
histology = case_when(
histology %in% c("ILC/IDC", "IDC+ILC") ~ "IDC/ILC",
histology %in% c("IDC+DCIS", "IDC/DCIS", "IDC + DCIS") ~ "IDC/DCIS",
histology %in% c("IDC/anaplastic", "IDC, medullary features") ~ "IDC",
TRUE ~ histology
))Rename columns
dat_clean <- dat_clean |>
rename(
sample_id = geo_accession,
response = pcr_vs_rd,
tumor_size_before = tbefore,
nodes_before = nbefore,
grade = bmngrd,
er_percent = er,
her2_status = `her2 status`,
her2_ihc = `her2 ihc`,
her2_fish = `her2 fish`,
treatment = `treatment code`)Convert data types
dat_clean <- dat_clean |>
mutate(
# factor varaibles
er_status = factor(er_status,
levels = c("N",
"P")),
pr_status = factor(pr_status,
levels = c("N",
"P")),
response = factor(response,
levels = c("RD",
"pCR")),
race = factor(race),
her2_status = factor(her2_status,
levels = c("N",
"P")),
her2_ihc = factor(her2_ihc,
levels = c("0",
"1",
"2",
"3")),
histology = factor(histology),
treatment = factor(treatment),
# numeric variables
age = as.numeric(age),
tumor_size_before = as.numeric(tumor_size_before),
nodes_before = as.numeric(nodes_before),
grade = as.numeric(grade),
er_percent = as.numeric(er_percent),
her2_fish = as.numeric(her2_fish))Save clean data
write_tsv(
dat_clean,
here("data",
"02_dat_clean.tsv"),
na = "NA")Augment Data
Load libraries
library("tidyverse")
library("here")Load data
input_file <- here("data/02_dat_clean.tsv")
dat_clean <- read_tsv(
input_file,
na = c("",
"NA"),
show_col_types = FALSE)Augment data
dat_aug <- dat_clean |>
mutate(
er_num = if_else(
er_status == "P",
1,
0),
pr_num = if_else(
pr_status == "P",
1,
0),
her2_num = if_else(
her2_status == "P",
1,
0),
# hormone receptor groups
hormone_group = paste0(
if_else(er_num == 1,
"ER+",
"ER-"),
if_else(pr_num == 1,
"PR+",
"PR-")),
# three receptor combination
receptor_group = paste0(
if_else(er_num == 1,
"ER+",
"ER-"),
if_else(pr_num == 1,
"PR+",
"PR-"),
if_else(her2_num == 1,
"HER2+",
"HER2-")),
# triple negative
triple_negative = case_when(
er_num == 0 & pr_num == 0 & her2_num == 0 ~ 1, # TNBC
TRUE ~ 0))Save augment data
write_tsv(
dat_aug,
here("data",
"03_dat_aug.tsv"),
na = "NA")Describe Data
Load Libraries
library("tidyverse")
library("here")
library("RColorBrewer")
source(here("R/99_proj_func.R"))Load Data
Load the data as a variable to use in descriptive graphs
augment_data <-
read_tsv(file = here("data/03_dat_aug.tsv"))Descriptive Graphs
custom_palette <- c(
"#7f3667",
# "#a53e76",
"#bb437e",
# "#d24787",
"#e44b8d",
"#e2619f",
# "#e27bb1",
"#e5a0c6",
"lightblue",
"#B3D84C",
# "#A6D15A",
"#90C84C",
"#70B83F",
"#4F9A2F",
# "#2F7B1F",
"#1D6510",
"#004400")Distribution of age groups and race
df_plot1 <- augment_data |>
mutate(
age_group = cut(
age,
breaks = seq(0,
100,
by = 10),
labels = paste(seq(0, 90, 10),
seq(9, 99, 10),
sep = "-"))) |>
filter(
!is.na(age_group)) |>
count(age_group,
race)
plot1_pallete <- c("#e44b8d",
"#e5a0c6",
"lightblue",
"#B3D84C",
"#70B83F")
gg_4_1 <- ggplot(
df_plot1,
aes(
x = age_group,
y = n,
fill = race)) +
geom_col(position = "stack") +
labs(
title = "Distribution of Age Groups and Race",
x = "Age groups",
y = "Number of subjects",
fill = "Races") +
scale_fill_manual(values = plot1_pallete) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
gg_4_1Distribution of hormones in different patients
df_plot2 <- augment_data |>
mutate(
receptor_group = case_when(
er_status == "N" & pr_status == "N" & her2_status == "N" ~ "ER-PR-HER2-",
er_status == "P" & pr_status == "N" & her2_status == "N" ~ "ER+PR-HER2-",
er_status == "N" & pr_status == "P" & her2_status == "N" ~ "ER-PR+HER2-",
er_status == "N" & pr_status == "N" & her2_status == "P" ~ "ER-PR-HER2+",
er_status == "P" & pr_status == "P" & her2_status == "N" ~ "ER+PR+HER2-",
er_status == "P" & pr_status == "N" & her2_status == "P" ~ "ER+PR-HER2+",
er_status == "N" & pr_status == "P" & her2_status == "P" ~ "ER-PR-HER2-",
er_status == "P" & pr_status == "P" & her2_status == "P" ~ "ER+PR+HER2+")) |>
count(receptor_group)
plot2_palette <- c("#7f3667",
"#e44b8d",
"#e5a0c6",
"lightblue",
"#B3D84C",
"#70B83F",
"#2F7B1F")
gg_4_2 <- ggplot(
df_plot2,
aes(
x = receptor_group,
y = n,
fill = receptor_group)) +
geom_col() +
labs(
title = "Distribution of Hormones",
x = "Receptor groups",
y = "Number of occurances",
fill = "Receptor groups") +
scale_fill_manual(values = plot2_palette) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 30,
hjust = 0.9,
vjust = 1.2),
plot.title = element_text(hjust = 0.5))
gg_4_2Distribution of successful or unsuccessful treatment
df_plot3 <- augment_data |>
filter(
!is.na(response)) |>
count(response)
plot3_palette <- c("#e44b8d",
"#90C84C")
gg_4_3 <- ggplot(
df_plot3,
aes(
x = response,
y = n,
fill = response)) +
geom_col() +
labs(
title = "Distribution of Treatment Success:
pCR = 'no detectable tumor after treatment'
RD = 'tumor remains after therapy'",
x = "Response type",
y = "Number of cases",
fill = "Response type") +
scale_fill_manual(values = plot3_palette) +
theme_minimal()
gg_4_3Distribution of tumor subtypes (histology)
df_plot4 <- augment_data |>
filter(
!is.na(histology)) |>
count(histology)
gg_4_4 <- ggplot(
df_plot4,
aes(
x = histology,
y = n,
fill = histology)) +
geom_col() +
labs(
title = "Distribution of Tumor Subtypes (Histology)",
x = "Histology",
y = "Number of cases",
fill = "Histology") +
scale_fill_manual(values = custom_palette) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1,
vjust = 1),
plot.title = element_text(hjust = 0.5))
gg_4_4Distribution of tumor aggressiveness (bmngrd = Bloom–Richardson (BR) grade)
df_plot5 <- augment_data |>
mutate(grade = factor(grade)) |>
count(grade)
plot5_palette <- c("#e44b8d",
"lightblue",
"#90C84C")
gg_4_5 <- ggplot(
df_plot5,
aes(
x = grade,
y = n,
fill = grade)) +
geom_col() +
labs(
title = "Distribution of Tumor Aggressiveness (BR grade)",
x = "Grade",
y = "Number of cases",
fill = "Grade") +
scale_fill_manual(values = plot5_palette, na.value = "gray") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
gg_4_5Save plots as png
save_plot(gg_4_1, "04_key_plot_1.png")
save_plot(gg_4_2, "04_plot_2.png")
save_plot(gg_4_3, "04_plot_3.png")
save_plot(gg_4_4, "04_plot_4.png")
save_plot(gg_4_5, "04_plot_5.png")Analysis
Loading Libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Reading the Data
data_path <- "data/03_dat_aug.tsv"
dataset <- read_tsv(file = here(data_path))Data Wrangling
dataset <- dataset |>
mutate(
# creating a new column for the combinations of E and P receptor statuses
er_pr_status = case_when(
er_status == "P" & pr_status == "P" ~ "ER Positive, PR Positive",
er_status == "P" & pr_status == "N" ~ "ER Positive, PR Negative",
er_status == "N" & pr_status == "P" ~ "ER Negative, PR Positive",
er_status == "N" & pr_status == "N" ~ "ER Negative, PR Negative"),
# making grade into a factor -> needed for coloring in graphs
grade = factor(grade, levels = c(1, 2, 3), exclude = NULL))Plotting
ER & PR Status vs Aggressiveness
receptors_vs_grade <- ggplot(
dataset,
aes(
x = er_pr_status,
fill = grade)) +
geom_bar(stat = "count",
position = "stack") +
scale_fill_manual(
values = c("1" = "pink",
"2" = "red",
"3" = "darkred",
"NA" = "gray")) +
labs(
title = "ER & PR Status vs Aggressiveness",
x = "Receptor Status Combination",
y = "Amount",
fill = "Grade") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
plot.title = element_text(hjust = 0.5))
receptors_vs_gradeSave plot
save_plot(receptors_vs_grade,
"05_plot_1.png")Loading Libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Reading the Data
data_path <- "data/03_dat_aug.tsv"
dataset <- read_tsv(file = here(data_path))Data Wrangling
dataset <- dataset |>
mutate(
# making grade into a factor -> needed for coloring in graphs
grade = factor(grade,
levels = c(1, 2, 3),
exclude = NULL))Plotting
HER2 vs Aggressiveness
her2_vs_grade <- ggplot(
dataset,
aes(
x = her2_status,
fill = grade)) +
geom_bar(stat = "count",
position = "stack") +
scale_fill_manual(
values = c("1" = "pink",
"2" = "red",
"3" = "darkred",
"NA" = "gray")) +
labs(
title = "HER2 Status vs Aggressiveness",
x = "HER2 Status",
y = "Amount",
fill = "Grade") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
plot.title = element_text(hjust = 0.5))
her2_vs_gradeSave plot
save_plot(her2_vs_grade,
"06_plot_2.png")Loading Libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Reading the Data
data_path <- "data/03_dat_aug.tsv"
dataset <- read_tsv(file = here(data_path))Data Wrangling
dataset <- dataset |>
mutate(
# making tumor_size_before into a factor -> needed for coloring in graphs
tumor_size_before = factor(
tumor_size_before,
levels = c(0, 1, 2, 3, 4),
exclude = NULL))Plotting
HER2 vs Tumor Size
her2_vs_tmrsz <- ggplot(
dataset,
aes(
x = her2_status,
fill = tumor_size_before)) +
geom_bar(stat = "count",
position = "stack") +
scale_fill_manual(
values = c("0" = "#ADD8E6",
"1" = "#87CEFA",
"2" = "#4682B4",
"3" = "#0000CD",
"4" = "#00008B",
"NA" = "gray")) +
labs(
title = "HER2 Status vs Tumor Size Before Treatment",
x = "HER2 Status",
y = "Amount",
fill = "Tumor Size") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
plot.title = element_text(hjust = 0.5))
her2_vs_tmrszSave plot
save_plot(her2_vs_tmrsz,
"07_plot_3.png")Loading Libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Reading the Data
data_path <- "data/03_dat_aug.tsv"
dataset <- read_tsv(file = here(data_path))Data Wrangling
dataset <- dataset |>
mutate(
# making nodes_before into a factor -> needed for coloring in graphs
nodes_before = factor(
nodes_before,
levels = c(0, 1, 2, 3),
exclude = NULL))Plotting
HER2 vs Nodes
her2_vs_nodes <- ggplot(
dataset,
aes(
x = her2_status,
fill = nodes_before)) +
geom_bar(stat = "count",
position = "stack") +
scale_fill_manual(
values = c("0" = "#90EE90",
"1" = "#32CD32",
"2" = "#228B22",
"3" = "#006400",
"NA" = "gray")) +
labs(
title = "HER2 Status vs Node Spread Before Treatment",
x = "HER2 Status",
y = "Amount",
fill = "Node Spread") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
plot.title = element_text(hjust = 0.5))
her2_vs_nodesSave plot
save_plot(her2_vs_nodes,
"08_plot_4.png")Loading Libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Reading the Data
data_path <- "data/03_dat_aug.tsv"
dataset <- read_tsv(file = here(data_path))Data Wrangling
dataset <- dataset |>
mutate(
# creating a new column for the combinations of E and P receptor statuses
er_pr_status = case_when(
er_status == "P" & pr_status == "P" ~ "ER Positive, PR Positive",
er_status == "P" & pr_status == "N" ~ "ER Positive, PR Negative",
er_status == "N" & pr_status == "P" ~ "ER Negative, PR Positive",
er_status == "N" & pr_status == "N" ~ "ER Negative, PR Negative"),
# making response into factor -> needed for coloring in graphs
response = factor(
response,
levels = c("RD", "pCR"),
exclude = NULL))Plotting
ER & PR Status vs Treatment Outcome
receptors_vs_outcome <- ggplot(
dataset,
aes(
x = er_pr_status,
fill = response)) +
geom_bar(stat = "count",
position = "stack") +
scale_fill_manual(
values = c("RD" = "darkred",
"pCR" = "#32CD32")) +
labs(
title = "ER & PR Status vs Treatment Outcome",
x = "Receptor Status Combination",
y = "Amount",
fill = "Response") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
plot.title = element_text(hjust = 0.5))
receptors_vs_outcomeSave plot
save_plot(receptors_vs_outcome,
"09_plot_5.png")Loading Libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Reading the Data
data_path <- "data/03_dat_aug.tsv"
dataset <- read_tsv(file = here(data_path))Data Wrangling
dataset <- dataset |>
mutate(
# making response into factor -> needed for coloring in graphs
response = factor(
response,
levels = c("RD", "pCR"),
exclude = NULL))Plotting
HER2 Status vs Treatment Outcome
her2_vs_outcome <- ggplot(
dataset,
aes(
x = her2_status,
fill = response)) +
geom_bar(stat = "count",
position = "stack") +
scale_fill_manual(
values = c("RD" = "darkred",
"pCR" = "#32CD32")) +
labs(
title = "HER2 Status vs Treatment Outcome",
x = "HER2 Status",
y = "Amount",
fill = "Response") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
plot.title = element_text(hjust = 0.5))
her2_vs_outcomeSave plot
save_plot(her2_vs_outcome,
"10_plot_6.png")Loading Libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Reading the Data
data_path <- "data/03_dat_aug.tsv"
dataset <- read_tsv(file = here(data_path))Data Wrangling
dataset <- dataset |>
mutate(
# making grade and response into factors
# -> for coloring graphs and separating NAs in grade
grade = factor(
grade,
levels = c(1, 2, 3),
exclude = NULL),
response = factor(
response,
levels = c("RD", "pCR"),
exclude = NULL))Plotting
Aggressiveness vs Treatment Outcome
grade_vs_outcome <- ggplot(
dataset,
aes(
x = grade,
fill = response)) +
geom_bar(stat = "count",
position = "stack") +
scale_fill_manual(
values = c("RD" = "darkred",
"pCR" = "#32CD32")) +
labs(
title = "Aggressiveness vs Treatment Outcome",
x = "Grade",
y = "Amount",
fill = "Response") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
plot.title = element_text(hjust = 0.5))
grade_vs_outcomeSave plot
save_plot(grade_vs_outcome,
"11_plot_7.png")Load libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Load data as variables
input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
input_file,
na = c("", "NA"),
show_col_types = FALSE)Graphical Comparison
Outcome compared to age
gg_8 <- dat_aug |>
ggplot(aes(
x = response,
y = age,
fill = response)) +
geom_boxplot() +
geom_jitter(width = 0.15,
alpha = 0.5,
size = 1.6) +
scale_fill_manual(
values = c("RD" = "blue",
"pCR" = "magenta")) +
labs(
x = "Response",
y = "Age",
title = "Distribution of age across outcome") +
theme_minimal(base_size = 14) +
theme(plot.title = element_text(hjust = 0.5))
gg_8Warning: Removed 1 row containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
Save plot
save_plot(gg_8,
"12_plot_8.png")Warning: Removed 1 row containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
Load libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Load data as variables
input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
input_file,
na = c("", "NA"),
show_col_types = FALSE)Age and race compared to aggressiveness and Tumor Size
Plot 1
gg_1 <- ggplot(
dat_aug,
aes(x = age,
y = grade,
color = race,
size = tumor_size_before)) +
geom_point(alpha = 0.7) +
scale_size_continuous(name = "Tumor size") +
labs(
title = "Age vs Tumor Aggressiveness Across Tumor Size and Race",
x = "Age",
y = "Tumor Aggressiveness",
color = "Race") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
gg_1Warning: Removed 11 rows containing missing values or values outside the scale range
(`geom_point()`).
Plot 2
gg_2 <- ggplot(
dat_aug,
aes(
x = factor(grade),
y = age)) +
geom_violin(fill = "steelblue",
alpha = 0.6,
color = NA) +
geom_jitter(
aes(
color = race,
size = tumor_size_before),
width = 0.15,
height = 0,
alpha = 0.7) +
labs(
title = "Age vs Tumor Aggressiveness
Across Tumor Size Before Treatment and Race",
x = "Tumor Aggressiveness",
y = "Age",
color = "Race",
size = "Tumore Size Before") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
gg_2Warning: Removed 1 row containing non-finite outside the scale range
(`stat_ydensity()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
Plot 3
gg_3 <- ggplot(
dat_aug,
aes(
x = factor(grade),
y = age)) +
geom_boxplot(outlier.shape = NA,
alpha = 0.4) +
geom_jitter(
aes(
color = race,
size = tumor_size_before),
width = 0.15,
height = 0,
alpha = 0.7) +
labs(
title = "Age vs Tumor Aggressiveness
Across Tumor Size Before Treatment and Race",
x = "Tumor Aggressiveness",
y = "Age",
color = "Race",
size = "Tumore Size Before") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
gg_3Warning: Removed 1 row containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
Plot 4
gg_4 <- ggplot(
dat_aug,
aes(
age,
fill = race)) +
geom_histogram(alpha = 0.5,
bins = 20,
position = "identity") +
facet_wrap(~ grade) +
theme_minimal() +
labs(
title = "Age vs Tumor Aggressiveness
Across Race, Devided by BR Grade",
x = "Tumor Aggressiveness",
y = "Age",
fill = "Race") +
theme(plot.title = element_text(hjust = 0.5))
gg_4Warning: Removed 1 row containing non-finite outside the scale range
(`stat_bin()`).
Heatmap Age vs. Aggressiveness
Plot 1
df <- dat_aug |>
mutate(
age_bin = cut(
age,
breaks = seq(20,
90,
by = 5)))
gg_5 <- ggplot(
df,
aes(
x = age,
y = grade)) +
stat_bin2d(bins = c(20, 3),
aes(fill = ..count..)) +
scale_fill_viridis_c(name = "Count") +
labs(
title = "Heatmap Age vs Tumor Aggressiveness",
x = "Age",
y = "Aggressiveness") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
gg_5Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(count)` instead.
Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_bin2d()`).
Plot 2
gg_6 <- df |>
mutate(age_bin = cut(
age,
breaks = seq(20,
90,
5))) |>
group_by(age_bin,
grade) |>
summarise(mean_size = mean(
tumor_size_before,
na.rm = TRUE)) |>
ggplot(aes(
age_bin,
factor(grade),
fill = mean_size)) +
geom_tile(color = "white") +
labs(
title = "Heatmap of Mean Tumor Size by Age & Aggressiveness",
x = "Age",
y = "Tumor Aggressiveness",
fill = "Mean size") +
theme_minimal(base_size = 14) +
theme(axis.text.x = element_text(angle = 45,
hjust = 1),
plot.title = element_text(hjust = 0.5))`summarise()` has grouped output by 'age_bin'. You can override using the
`.groups` argument.
gg_6Plot 3
gg_7 <- ggplot(
df,
aes(
x = age,
y = grade)) +
stat_density_2d_filled(contour_var = "ndensity") +
labs(
title = "2D Density Heatmap: Age vs Tumor Aggressiveness",
x = "Age",
y = "Tumor Aggressiveness") +
theme_minimal(base_size = 14) +
theme(plot.title = element_text(hjust = 0.5))
gg_7Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_density2d_filled()`).
Save all plots
save_plot(gg_1,
"13_plot_9_1.png")Warning: Removed 11 rows containing missing values or values outside the scale range
(`geom_point()`).
save_plot(gg_2,
"13_plot_9_2.png")Warning: Removed 1 row containing non-finite outside the scale range
(`stat_ydensity()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
save_plot(gg_3,
"13_plot_9_3.png")Warning: Removed 1 row containing non-finite outside the scale range (`stat_boxplot()`).
Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
save_plot(gg_4,
"13_plot_9_4.png")Warning: Removed 1 row containing non-finite outside the scale range
(`stat_bin()`).
save_plot(gg_5,
"13_plot_9_5.png")Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_bin2d()`).
save_plot(gg_6,
"13_plot_9_6.png")
save_plot(gg_7,
"13_plot_9_7.png")Warning: Removed 11 rows containing non-finite outside the scale range
(`stat_density2d_filled()`).
Load Libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Load Data into variable
input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
input_file,
na = c("", "NA"),
show_col_types = FALSE)Tumorsize, Lymphnodes and Aggression Comparison
gg_1 <- ggplot(
dat_aug,
aes(
x = tumor_size_before,
y = nodes_before,
color = grade)) +
geom_jitter(width = 0.2,
height = 0.2,
size = 3) +
theme_minimal() +
labs(
title = "Tumor size vs. Lymphnodes and Aggression of Tumor",
x = "Tumor Sizw",
y = "Lymphnodes") +
theme(plot.title = element_text(hjust = 0.5))
gg_1Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
gg_2 <- dat_aug |>
count(tumor_size_before,
nodes_before,
grade) |>
ggplot(aes(
x = tumor_size_before,
y = nodes_before,
fill = n)) +
geom_tile(color = "white") +
facet_wrap(~grade) +
theme_minimal() +
labs(
title = "Amount of Tumor Sizes and Lymphnodes according to Aggression",
x = "Tumorsize",
y = "Lymphnodes",
fill = "Amount") +
theme(plot.title = element_text(hjust = 0.5))
gg_2Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_tile()`).
Save plots
save_plot(gg_1,
"14_plot_10_1.png")Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).
save_plot(gg_2,
"14_plot_10_2.png")Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_tile()`).
Load Libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Load Data
input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
input_file,
na = c("", "NA"),
show_col_types = FALSE)Response to Treatment according to Receptor Status
gg_11 <- dat_aug |>
count(receptor_group,
response) |>
group_by(receptor_group) |>
mutate(
percent = n / sum(n) * 100) |>
ggplot(aes(
x = receptor_group,
y = percent,
fill = response)) +
geom_col(position = "stack") +
theme_minimal() +
labs(
title = "Response to Treatment according to Receptor Status",
x = "Receptors",
y = "Percent",
fill = "Response")+
theme(axis.text.x = element_text(angle = 45,
hjust = 1,
vjust = 1.2),
plot.title = element_text(hjust = 0.5))
gg_11Save plot
save_plot(gg_11,
"15_plot_11.png")Load Libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Load Data
input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
input_file,
na = c("", "NA"),
show_col_types = FALSE)Plot
gg_1 <- ggplot(
dat_aug,
aes(
x = her2_status,
y = her2_fish,
fill = her2_status)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "HER2-FISH and HER2 receptor Status Comparison",
x = "HER2 Receptor",
y = "HER2-FISH") +
theme(legend.position = "none",
plot.title = element_text(hjust = 0.5))
gg_1Warning: Removed 49 rows containing non-finite outside the scale range
(`stat_boxplot()`).
gg_2 <- ggplot(
dat_aug,
aes(
x = as.numeric(tumor_size_before),
y = her2_fish)) +
geom_jitter(width = 0.2,
alpha = 0.5) +
geom_smooth(method = "lm",
se = TRUE,
color = "blue") +
theme_minimal() +
labs(
x = "Tumorsize",
y = "HER2-FISH",
title = "Linear Regression: HER2-FISH vs Tumorgsize") +
theme(plot.title = element_text(hjust = 0.5))
gg_2`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 49 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 49 rows containing missing values or values outside the scale range
(`geom_point()`).
Save plots
save_plot(gg_1,
"16_plot_12_1.png")Warning: Removed 49 rows containing non-finite outside the scale range
(`stat_boxplot()`).
save_plot(gg_2,
"16_plot_12_2.png")`geom_smooth()` using formula = 'y ~ x'
Warning: Removed 49 rows containing non-finite outside the scale range
(`stat_smooth()`).
Warning: Removed 49 rows containing missing values or values outside the scale range
(`geom_point()`).
Load Libraries
library("here")
library("tidyverse")
source(here("R/99_proj_func.R"))Load Data
input_file <- here("data/03_dat_aug.tsv")
dat_aug <- read_tsv(
input_file,
na = c("", "NA"),
show_col_types = FALSE)PCA
Set up the dataset
df_clean <- dat_aug |>
select(
her2_num,
er_num,
pr_num,
age) |>
drop_na()
df_scaled <- df_clean |>
map_dfc(~ (. - mean(.)) / sd(.))
pca_res <- prcomp(df_scaled,
center = FALSE,
scale. = FALSE)
pca_df <- as_tibble(pca_res$x) |>
bind_cols(df_clean)Plot PCAs
gg_1 <- pca_df |>
ggplot(aes(
x = PC1,
y = PC2,
color = factor(er_num))) +
geom_point(size = 3,
alpha = 0.7) +
labs(
title = "PCA Plot of ER Status",
color = "ER Status") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
gg_2 <- pca_df |>
ggplot(aes(
x = PC1,
y = PC2,
color = factor(pr_num))) +
geom_point(size = 3,
alpha = 0.7) +
labs(
title = "PCA Plot of PR Status",
color = "PR Status") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
gg_3 <- pca_df |>
ggplot(aes(
x = PC1,
y = PC2,
color = factor(her2_num))) +
geom_point(size = 3,
alpha = 0.7) +
labs(
title = "PCA Plot of HER2 Status",
color = "HER2 Status") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
gg_1gg_2gg_3Save plots
save_plot(gg_1,
"17_plot_13_1.png")
save_plot(gg_2,
"17_plot_13_2.png")
save_plot(gg_3,
"17_plot_13_3.png")Running Code
Loading libraries:
library("here")
library("tidyverse")
library("scales")
source(here("R/99_proj_func.R"))Read data:
dataset <- read_tsv(file = here("data/03_dat_aug.tsv"))Re-ordering
Ordering the different genotypes into groups from the augmented data:
# Create ER/PR/HER2 combo groups
dataset <- dataset |>
mutate(
ER = if_else(
er_status == "P",
"ER+",
"ER-"),
PR = if_else(
pr_status == "P",
"PR+",
"PR-"),
HER2 = if_else(
her2_status == "P",
"HER2+",
"HER2-"),
combo = paste(
ER,
PR,
HER2,
sep = ", ")) |>
mutate(
combo = factor(
combo,
levels = c(
"ER+, PR-, HER2-",
"ER+, PR+, HER2-",
"ER+, PR-, HER2+",
"ER+, PR+, HER2+",
"ER-, PR+, HER2-",
"ER-, PR+, HER2+",
"ER-, PR-, HER2+",
"ER-, PR-, HER2-")))
# check: how many rows total?
nrow(dataset)[1] 278
# check: how many patients in each genotype?
dataset |>
count(combo)# A tibble: 8 × 2
combo n
<fct> <int>
1 ER+, PR-, HER2- 46
2 ER+, PR+, HER2- 94
3 ER+, PR-, HER2+ 8
4 ER+, PR+, HER2+ 16
5 ER-, PR+, HER2- 8
6 ER-, PR+, HER2+ 3
7 ER-, PR-, HER2+ 32
8 ER-, PR-, HER2- 71
Graph 1:
Response (pCR vs RD) by ER/PR/HER2 Gentypes
pCR = pathological complete response, meaning no residual invasive cancer found in the breast or lymph nodes at the time of surgery, It means the treatment eliminated all detectable invasive tumor. This is a strong positive prognostic indicator.
RD = residual disease, meaning invasive tumor remains in the breast and/or lymph nodes after treatment. It means the cancer did not completely respond to treatment. Indicates relative treatment resistance. RD is considered a non-responder outcome.
# total number of patients
N_total <- nrow(dataset)
# Count N per genotype (combo) group
group_counts_resp <- dataset |>
group_by(combo) |>
summarise(N_group = n(),
.groups = "drop")
custom_palette <- c("#e44b8d",
"#90C84C")
gg_14 <- ggplot(
dataset,
aes(
x = combo,
fill = response)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = percent_format()) +
# show all 8 levels even if some are empty
scale_x_discrete(drop = FALSE) +
# add N labels above each bar
geom_text(
data = group_counts_resp,
aes(
x = combo,
y = 1.05,
label = paste0("N=", N_group)),
size = 3,
inherit.aes = FALSE) +
labs(
title = "Response (pCR=good vs RD=bad) by ER/PR/HER2 Genotypes",
x = "ER/PR/HER2 Genotypes",
y = paste0("Proportion of Patients (N = ", N_total, ")"),
fill = "Response") +
scale_fill_manual(values = custom_palette) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1,
size = 8),
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(vjust = 0)) +
coord_cartesian(ylim = c(0, 1.1))
gg_14Save plot
save_plot(gg_14,
"18_key_plot_14.png")Conclusions:
Estrogen receptor-positive breast cancer patients have a worse treatment response (Graph 1).
Running Code
Loading libraries:
library("here")
library("tidyverse")
library("scales")
source(here("R/99_proj_func.R"))Read data:
dataset <- read_tsv(file = here("data/03_dat_aug.tsv"))Re-ordering
Ordering the different genotypes into groups from the augmented data:
# Create ER/PR/HER2 combo groups
dataset <- dataset |>
mutate(
ER = if_else(
er_status == "P",
"ER+",
"ER-"),
PR = if_else(
pr_status == "P",
"PR+",
"PR-"),
HER2 = if_else(
her2_status == "P",
"HER2+",
"HER2-"),
combo = paste(
ER,
PR,
HER2,
sep = ", ")) |>
mutate(
combo = factor(
combo,
levels = c(
"ER+, PR-, HER2-",
"ER+, PR+, HER2-",
"ER+, PR-, HER2+",
"ER+, PR+, HER2+",
"ER-, PR+, HER2-",
"ER-, PR+, HER2+",
"ER-, PR-, HER2+",
"ER-, PR-, HER2-")))
# check: how many rows total?
nrow(dataset)[1] 278
# check: how many patients in each genotype?
dataset |>
count(combo)# A tibble: 8 × 2
combo n
<fct> <int>
1 ER+, PR-, HER2- 46
2 ER+, PR+, HER2- 94
3 ER+, PR-, HER2+ 8
4 ER+, PR+, HER2+ 16
5 ER-, PR+, HER2- 8
6 ER-, PR+, HER2+ 3
7 ER-, PR-, HER2+ 32
8 ER-, PR-, HER2- 71
Graph 2:
Tumor Grade (1–3) by ER/PR/HER2 Genotypes
# Prepare data with grade as factor (1,2,3,NA)
dat_grade <- dataset |>
mutate(
grade = as.character(grade),
grade = if_else(is.na(grade),
"NA",
grade),
grade = factor(grade,
levels = c("1", "2", "3", "NA")))
# total N (same as before, but explicit)
N_total_grade <- nrow(dat_grade)
# N per genotype
group_counts_grade <- dat_grade |>
group_by(combo) |>
summarise(N_group = n(),
.groups = "drop")
custom_palette <- c("#e44b8d",
"lightblue",
"#90C84C",
"gray")
gg_15 <- ggplot(
dat_grade,
aes(
x = combo,
fill = grade)) +
geom_bar(position = "fill") +
scale_x_discrete(drop = FALSE) +
scale_y_continuous(labels = percent_format()) +
# N labels above bars
geom_text(
data = group_counts_grade,
aes(
x = combo,
y = 1.05,
label = paste0("N=",
N_group)),
size = 3,
inherit.aes = FALSE) +
labs(
title = "Tumor Grade (1–3) by ER/PR/HER2 Genotypes",
x = "ER/PR/HER2 Genotypes",
y = paste0("Proportion of Patients (N = ", N_total_grade, ")"),
fill = "Grade") +
scale_fill_manual(values = custom_palette) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1,
size = 8),
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(vjust = 0)) +
coord_cartesian(ylim = c(0, 1.1))
gg_15Save plot
save_plot(gg_15,
"19_key_plot_15.png")Conclusions:
Even tough being postitive for Estrogen receptors do have the worst outcome (Graph 1), they do not have a more agressive tumor grade (Graph 2). Being negative for both hormone receptors leads to a more aggressive tumor grade (Graph 2), and those two actually had the worst outcome (Graph 1).
Running Code
Loading libraries:
library("here")
library("tidyverse")
library("scales")
source(here("R/99_proj_func.R"))Read data:
dataset <- read_tsv(file = here("data/03_dat_aug.tsv"))Re-ordering
Ordering the different genotypes into groups from the augmented data:
# Create ER/PR/HER2 combo groups
dataset <- dataset |>
mutate(
ER = if_else(
er_status == "P",
"ER+",
"ER-"),
PR = if_else(
pr_status == "P",
"PR+",
"PR-"),
HER2 = if_else(
her2_status == "P",
"HER2+",
"HER2-"),
combo = paste(
ER,
PR,
HER2,
sep = ", ")) |>
mutate(
combo = factor(
combo,
levels = c(
"ER+, PR-, HER2-",
"ER+, PR+, HER2-",
"ER+, PR-, HER2+",
"ER+, PR+, HER2+",
"ER-, PR+, HER2-",
"ER-, PR+, HER2+",
"ER-, PR-, HER2+",
"ER-, PR-, HER2-")))
# check: how many rows total?
nrow(dataset)[1] 278
# check: how many patients in each genotype?
dataset |>
count(combo)# A tibble: 8 × 2
combo n
<fct> <int>
1 ER+, PR-, HER2- 46
2 ER+, PR+, HER2- 94
3 ER+, PR-, HER2+ 8
4 ER+, PR+, HER2+ 16
5 ER-, PR+, HER2- 8
6 ER-, PR+, HER2+ 3
7 ER-, PR-, HER2+ 32
8 ER-, PR-, HER2- 71
Graph 3:
Prepare for heatmap plot to look at the tumor size (0-4) among genotypes:
# 1. Make a clean tumor-size variable with NA as category
dataset_ts <- dataset |>
mutate(
ts = ifelse(
is.na(tumor_size_before),
"NA",
as.character(tumor_size_before)))
# all genotype levels (from your combo factor) and all tumor-size levels
all_combos <- levels(dataset$combo)
ts_levels <- c("0", "1", "2", "3", "4", "NA")
# 2. Create full grid: every combo × every tumor-size level
full_grid_ts <-
expand.grid(
combo = all_combos,
ts = ts_levels,
stringsAsFactors = FALSE)
# 3. Summary: counts & percentages per combo × tumor size
ts_summary <-
full_grid_ts |>
left_join(
dataset_ts |>
count(combo, ts),
by = c("combo", "ts")) |>
mutate(
n = ifelse(
is.na(n),
0,
n)) |>
group_by(combo) |>
mutate(
total = sum(n),
pct = ifelse(
total > 0,
n / total,
0)) |>
ungroup()
# factor for tumor size (plot rows)
ts_summary$ts <- factor(
ts_summary$ts,
levels = ts_levels)
# 4. N per combo (for labels)
group_counts_ts <-
ts_summary |>
distinct(combo,
total) |>
rename(N_group = total)
# 5. Extended factor for plotting: add an extra row "N" under 0 just for labels
ts_levels_with_N <- c(ts_levels, "N")
ts_summary <- ts_summary |>
mutate(
ts_plot = factor(
as.character(ts),
levels = ts_levels_with_N))
n_label_df <- group_counts_ts |>
mutate(
ts_plot = factor(
"N",
levels = ts_levels_with_N))Graph - Heatmap plot - Tumor Size Before Treatment by ER/PR/HER2 Genotypes
gg_16 <- ggplot() +
# heatmap tiles
geom_tile(
data = ts_summary,
aes(
x = combo,
y = ts_plot,
fill = pct)) +
# percentage labels inside tiles
geom_text(
data = ts_summary,
aes(
x = combo,
y = ts_plot,
label = paste0(round(pct * 100),
"%")),
size = 3) +
# N labels in their own row under the heatmap
geom_text(
data = n_label_df,
aes(
x = combo,
y = ts_plot,
label = paste0("N=",
N_group)),
size = 3) +
scale_fill_gradient(
low = "#e5a0c6",
high = "#4F9A2F",
labels = percent_format(accuracy = 1),
name = "Proportion") +
# keep all genotypes, even if some have few patients
scale_x_discrete(drop = FALSE) +
# y labels: hide label for the extra "N" row with ""
scale_y_discrete(
breaks = ts_levels_with_N,
labels = c("0", "1", "2", "3", "4", "NA", "")) +
labs(
title = "Tumor Size Before Treatment by ER/PR/HER2 Genotypes",
x = "ER/PR/HER2 Genotypes",
y = "Tumor size before surgery") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 1,
size = 8),
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(vjust = 0))
gg_16Save plot
save_plot(gg_16,
"20_key_plot_16.png")Conclusion:
Far most of the genotype groups have a size 2 tumor stage, or otherwise they have developed into size 3-4. This is especially the case for groups that are Estrogen receptor positive, and the last two groups which are negative for both hormone receptors.
Running Code
Loading libraries:
library("here")
library("tidyverse")
library("scales")
source(here("R/99_proj_func.R"))Read data:
dataset <- read_tsv(file = here("data/03_dat_aug.tsv"))Re-ordering
Ordering the different genotypes into groups from the augmented data:
# Create ER/PR/HER2 combo groups
dataset <- dataset |>
mutate(
ER = if_else(
er_status == "P",
"ER+",
"ER-"),
PR = if_else(
pr_status == "P",
"PR+",
"PR-"),
HER2 = if_else(
her2_status == "P",
"HER2+",
"HER2-"),
combo = paste(
ER,
PR,
HER2,
sep = ", ")) |>
mutate(
combo = factor(
combo,
levels = c(
"ER+, PR-, HER2-",
"ER+, PR+, HER2-",
"ER+, PR-, HER2+",
"ER+, PR+, HER2+",
"ER-, PR+, HER2-",
"ER-, PR+, HER2+",
"ER-, PR-, HER2+",
"ER-, PR-, HER2-")))
# check: how many rows total?
nrow(dataset)[1] 278
# check: how many patients in each genotype?
dataset |>
count(combo)# A tibble: 8 × 2
combo n
<fct> <int>
1 ER+, PR-, HER2- 46
2 ER+, PR+, HER2- 94
3 ER+, PR-, HER2+ 8
4 ER+, PR+, HER2+ 16
5 ER-, PR+, HER2- 8
6 ER-, PR+, HER2+ 3
7 ER-, PR-, HER2+ 32
8 ER-, PR-, HER2- 71
Graph 4:
Prepare for a bubble plot to look at the lymph nodes among genotype groups:
# 1. Clean node categories (0–3 + NA)
dat_nodes <- dataset |>
mutate(
nodes_cat = if_else(
is.na(nodes_before),
"NA",
as.character(nodes_before)))
# 2. Summarise counts and percentages per combo × node category
nodes_summary <- dat_nodes |>
count(combo,
nodes_cat,
name = "n") |>
group_by(combo) |>
mutate(
total = sum(n),
pct = n / total) |>
ungroup()
# 3. N per combo (for labels)
group_counts_nodes <- nodes_summary |>
distinct(combo,
total) |>
rename(N_group = total)
# 4. Y-axis levels for plotting:
# bottom → top: 3, 2, 1, 0, NA, N(row for labels)
y_levels <- c("0", "1", "2", "3", "NA", "N")
# Add plotting factor for nodes
nodes_summary <- nodes_summary |>
mutate(
nodes_y = factor(
nodes_cat,
levels = y_levels))
# Data frame for N labels row
n_label_df <- group_counts_nodes |>
mutate(
nodes_y = factor(
"N",
levels = y_levels))
head(nodes_summary)# A tibble: 6 × 6
combo nodes_cat n total pct nodes_y
<fct> <chr> <int> <int> <dbl> <fct>
1 ER+, PR-, HER2- 0 10 46 0.217 0
2 ER+, PR-, HER2- 1 25 46 0.543 1
3 ER+, PR-, HER2- 2 3 46 0.0652 2
4 ER+, PR-, HER2- 3 8 46 0.174 3
5 ER+, PR+, HER2- 0 37 94 0.394 0
6 ER+, PR+, HER2- 1 40 94 0.426 1
group_counts_nodes# A tibble: 8 × 2
combo N_group
<fct> <int>
1 ER+, PR-, HER2- 46
2 ER+, PR+, HER2- 94
3 ER+, PR-, HER2+ 8
4 ER+, PR+, HER2+ 16
5 ER-, PR+, HER2- 8
6 ER-, PR+, HER2+ 3
7 ER-, PR-, HER2+ 32
8 ER-, PR-, HER2- 71
Graph - Bubble plot - Lymph Node Status Before Treatment by ER/PR/HER2 Genotypes
gg_17 <- ggplot() +
# Bubbles: size & colour = percentage
geom_point(
data = nodes_summary,
aes(
x = combo,
y = nodes_y,
size = pct,
colour = pct),
alpha = 0.9) +
# N labels in their own row ABOVE NA
geom_text(
data = n_label_df,
aes(
x = combo,
y = nodes_y,
label = paste0("N=", N_group)),
size = 3,) +
# bubble size = percentage (hide legend)
scale_size(
range = c(3, 10),
labels = percent_format(accuracy = 1),
guide = "none") +
# bubble colour = percentage (red gradient)
scale_colour_gradient(
low = "#e2619f",
high = "#90C84C",
labels = percent_format(accuracy = 1),
name = "Proportion") +
# keep all 8 genotype groups
scale_x_discrete(drop = FALSE) +
# y labels: from TOP to BOTTOM = NA, 3, 2, 1, 0; "" hides N
scale_y_discrete(
limits = y_levels,
labels = c("0", "1", "2", "3", "NA", "")) +
labs(
title = "Lymph Node Status Before Treatment by ER/PR/HER2 Genotypes",
x = "ER/PR/HER2 Genotypes",
y = "Nodes before surgery") +
coord_cartesian(ylim = c(0.2,
length(y_levels) + 0.7)) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45,
hjust = 0.8,
size = 8),
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(vjust = 0))
gg_17Save plot
save_plot(gg_17,
"21_key_plot_17.png")Conclusion:
Generally it seems to be worst to have these three genotypes:
ER+, PR-, HER2-
ER-PR-, HER+2
ER-, PR-, HER2- (triple negative)
As they have more affected lymph nodes.
Running Code
Loading libraries:
library("here")
library("tidyverse")
library("scales")
source(here("R/99_proj_func.R"))Read data:
dataset <- read_tsv(file = here("data/03_dat_aug.tsv"))Re-ordering
Ordering the different genotypes into groups from the augmented data:
# Create ER/PR/HER2 combo groups
dataset <- dataset |>
mutate(
ER = if_else(
er_status == "P",
"ER+",
"ER-"),
PR = if_else(
pr_status == "P",
"PR+",
"PR-"),
HER2 = if_else(
her2_status == "P",
"HER2+",
"HER2-"),
combo = paste(
ER,
PR,
HER2,
sep = ", ")) |>
mutate(
combo = factor(
combo,
levels = c(
"ER+, PR-, HER2-",
"ER+, PR+, HER2-",
"ER+, PR-, HER2+",
"ER+, PR+, HER2+",
"ER-, PR+, HER2-",
"ER-, PR+, HER2+",
"ER-, PR-, HER2+",
"ER-, PR-, HER2-")))
# check: how many rows total?
nrow(dataset)[1] 278
# check: how many patients in each genotype?
dataset |>
count(combo)# A tibble: 8 × 2
combo n
<fct> <int>
1 ER+, PR-, HER2- 46
2 ER+, PR+, HER2- 94
3 ER+, PR-, HER2+ 8
4 ER+, PR+, HER2+ 16
5 ER-, PR+, HER2- 8
6 ER-, PR+, HER2+ 3
7 ER-, PR-, HER2+ 32
8 ER-, PR-, HER2- 71
Graph 5:
Prep: treatment among genotypes
# Clean treatment variable
dat_treat <- dataset |>
mutate(
treatment = as.character(treatment),
treatment = if_else(
is.na(treatment),
"NA",
treatment))
# Summarise: count + proportions per genotype × treatment
treat_summary <- dat_treat |>
group_by(combo,
treatment) |>
summarise(n = n(),
.groups = "drop") |>
group_by(combo) |>
mutate(
total = sum(n),
pct = n / total) |>
ungroup()
# N per combo (for labels)
group_counts_treat <- treat_summary |>
distinct(combo,
total) |>
rename(N_group = total)
# Keep the same genotype order you already defined
treat_summary$combo <- factor(
treat_summary$combo,
levels = levels(dataset$combo))Graph - Treatment Distribution Across ER/PR/HER2 Genotypes
# Construct color palett
treatment_colors <- c(
"FAC" = "#7f3667",
"FACT" = "#bb437e",
"FACT+XRT/X" = "#e44b8d",
"FEC" = "#e2619f",
"TFAC" = "#e5a0c6",
"FECT" = "lightblue",
"TFAC/HT" = "#B3D84C",
"TFEC" = "#90C84C",
"TH/FAC" = "#70B83F",
"TH/FEC" = "#4F9A2F",
"Tonly" = "#1D6510",
"TXFAC" = "#004400",
"NA" = "grey")gg_18 <- ggplot(
treat_summary,
aes(
x = combo,
y = pct,
fill = treatment)) +
# N labels above bars
geom_text(
data = group_counts_treat,
aes(
x = combo,
y = 1.07,
label = paste0("N=",
N_group)),
size = 3,
inherit.aes = FALSE) +
# Stacked bars
geom_col(position = "fill") +
# Percent labels
geom_text(
aes(
label = ifelse(
pct > 0.02,
paste0(round(pct * 100), "%"),
"")),
position = position_fill(vjust = 0.5),
size = 2.5) +
# Apply grey for NA + colors for other treatments
scale_fill_manual(values = treatment_colors) +
scale_x_discrete(drop = FALSE) +
scale_y_continuous(labels = percent_format()) +
labs(
title = "Treatment Distribution Across ER/PR/HER2 Genotypes",
x = "ER/PR/HER2 Genotype",
y = "Proportion of Patients",
fill = "Treatment") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 20,
hjust = 1,
size = 8),
plot.title = element_text(hjust = 0.5),
axis.title.x = element_text(vjust = 0),
legend.position = "bottom",
legend.title = element_text(size = 10),
legend.text = element_text(size = 8),
legend.box.spacing = unit(0.2, "lines"),
legend.key.size = unit(3.5, "mm")) +
guides(fill = guide_legend(
ncol = 7,
byrow = TRUE,
title.position = "top")) +
coord_cartesian(ylim = c(0, 1.1))
gg_18Save plot
save_plot(gg_18,
"22_key_plot_18.png")Conclusion:
Generally, more different treatments were tried in the last two groups, especially for the second last group, which also had the best response. The second group from the left (ER+, PR+, HER2-) also had more treatments but showed a bad response. So in conclusion, (1) more personalized treatment options give better response, (2) and a general broad TFAC and TFEC is not working for estrogen-positive patients.
Aggressiveness, tumor size, and nodes do not necessarily mean a bad outcome. With personalized treatments, it can mean good results for the patient even if the stage or grade of the tumor seems to have a theoretically bad prognosis.
Load Libraries
library("tidyverse")
library("here")
source(here("R/99_proj_func.R"))Load Data
load the data as a variable to use in descriptive graphs
augment_data <-
read_tsv(file = here("data/03_dat_aug.tsv"))Age vs. Genotype
df_plot1 <- augment_data |>
mutate(
age_group = cut(
age,
breaks = seq(0, 100, by = 10),
labels = paste(seq(0, 90, 10),
seq(9, 99, 10),
sep = "-"))) |>
filter(!is.na(age_group)) |>
count(age_group,
receptor_group)
custom_palette <- c("#a53e76",
"#d24787",
"#e2619f",
"#e5a0c6",
"#B3D84C",
"#90C84C",
"#4F9A2F",
"#1D6510")
gg_19 <- ggplot(
df_plot1,
aes(
x = age_group,
y = n,
fill = receptor_group)) +
geom_col(position = "stack") +
labs(
title = "Distribution of Age Groups and Genotypes",
y = "Amount",
x = "Age groups",
fill = "Receptor groups") +
scale_fill_manual(values = custom_palette) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
gg_19Save plot
save_plot(gg_19,
"23_key_plot_19.png")Table of Contents of Analysis files:
01 - E and P receptor statuses vs cancer aggressiveness
02 - HER2 receptor status vs cancer aggressiveness
03 - HER2 receptor status vs tumor size before treatment
04 - HER2 receptor status vs nodes before treatment
05 - E and P receptor statuses vs treatment outcome
06 - HER2 receptor status vs treatment outcome
07 - Cancer aggressiveness vs treatment outcome
08 - Age vs. Outcome
09 - Age and Race vs. Aggressivness and Tumor Size
10 - Tumorsize, Lymphnodes and Aggression Comparison
11 - Response to Treatment according to Receptor Status
12 - Her2 fish vs. Her2 receptor positive or negative and linear regression of her2 and tumor size
13 - PCA with colouring according to receptor positiveness or negativeness
14 - Response (pCR vs RD) by ER/PR/HER2 Genotypes
15 - Tumor Grade (1–3) by ER/PR/HER2 Genotypes
16 - Tumor Size Before Treatment by ER/PR/HER2 Genotypes
17 - Lymph Node Status Before Treatment by ER/PR/HER2 Genotypes
18 - Treatment Distribution Across ER/PR/HER2 Genotypes
19 - Age and Genotype Comparison